Q1

First I load the data files

library(tidyverse)
dat <- read.csv("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/migration2012.csv")
states <- read.csv("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/states_chord.csv")

dat <- dat %>%
tidyr::gather(key=From, value=value, Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District.of.Columbia,Florida,Georgia,Hawaii,Idaho,Illinois,Indiana,Iowa,Kansas,Kentucky,Louisiana,Maine,Maryland,Massachusetts,Michigan,Minnesota,Mississippi,Missouri,Montana,Nebraska,Nevada,New.Hampshire,New.Jersey,New.Mexico,New.York,North.Carolina,North.Dakota,Ohio,Oklahoma,Oregon,Pennsylvania,Rhode.Island,South.Carolina,South.Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West.Virginia,Wisconsin,Wyoming)

Replace the punctuation characters with spaces.

dat <- data.frame(apply(dat, 2, function(y) gsub("[[:punct:]]", " ", y)))
dat

Then I add two new columns to the above data frame.

d1 <- merge(dat, states, by.x="From", by.y="States")
d1 <- d1[,c(-5,-6)]
names(d1)[4] <- "ID_From"
d2 <- merge(d1, states, by.x="To", by.y="States")
d2 <- d2[,c(-6,-7)]
names(d2)[5] <- "ID_To"
dat <- d2
dat

Sort the data frame in the ascending order

dat <- arrange(dat, ID_From)
dat <- arrange(dat, ID_To)
dat

Convert the above data frame into the matrix form

t1 <- dat[,c(1,2,4,5)]
t1 <- t1 %>%
  spread(key=To, value=ID_To)
t2 <- dat[,c(1,2,3,4)]
t2 <- t2 %>%
  spread(key=To, value=value)
t3 <- rbind(t1[1,],t2)
t3$From <- as.character(t3$From)
t3[1,2] <- 0
t3[1,1] <- "ID_To"
t3 <- t3 %>%
  arrange(ID_From)
library(data.table)
data.table 1.10.4
**********
This installation of data.table has not detected OpenMP support. It will still work but in single-threaded mode. If this a Mac and you obtained the Mac binary of data.table from CRAN, CRAN's Mac does not yet support OpenMP. In the meantime please follow our Mac installation instructions on the data.table homepage. If it works and you observe benefits from multiple threads as others have reported, please convince Simon Ubanek by sending him evidence and ask him to turn on OpenMP support when CRAN builds package binaries for Mac. Alternatives are to install Ubuntu on your Mac (which I have done and works well) or use Windows where OpenMP is supported and works well.
**********
  The fastest way to learn (by data.table authors): https://www.datacamp.com/courses/data-analysis-the-data-table-way
  Documentation: ?data.table, example(data.table) and browseVignettes("data.table")
  Release notes, videos and slides: http://r-datatable.com
-----------------------------------------------------------------------------------------------------------------------------------
data.table + dplyr code now lives in dtplyr.
Please library(dtplyr)!
-----------------------------------------------------------------------------------------------------------------------------------

 次のパッケージを付け加えます: ‘data.table’ 

 以下のオブジェクトは ‘package:dplyr’ からマスクされています: 

     between, first, last 

 以下のオブジェクトは ‘package:purrr’ からマスクされています: 

     transpose 
setcolorder(t3,c("From","ID_From","Connecticut","Maine","Massachusetts","New Hampshire","Rhode Island",
"Vermont","New Jersey","New York","Pennsylvania","Illinois","Indiana",
"Michigan","Ohio","Wisconsin","Iowa","Kansas","Minnesota",
"Missouri","Nebraska","North Dakota","South Dakota","Delaware","Florida",
"Georgia","Maryland","North Carolina","South Carolina","Virginia","District of Columbia",
"West Virginia","Alabama","Kentucky","Mississippi","Tennessee","Arkansas",
"Louisiana","Oklahoma","Texas","Arizona","Colorado","Idaho",
"Montana","Nevada","New Mexico","Utah","Wyoming","Alaska",
"California","Hawaii","Oregon","Washington"))
t3 <- t3[c(-1),c(-2)]
t4 <- t3[,-1]
rownames(t4) <- t3[,1]
t5 <- data.frame(apply(t4, 2, function(y) as.numeric(y)))
rownames(t5) <- t3[,1]
t5

Drawing the chord diagram

chorddiag(as.matrix(t5),groupColors=states$Color,showTicks=F,groupnamePadding = 20,groupThickness=.05,groupnameFontsize=10)
row names of the 'data' matrix differ from its column names or the 'groupNames' argument.

Q2

Read the “Stops On Lines” layer and all of the GIS data for the bus lines.

library(dplyr)
library(sp)
library(rgdal)
library(leaflet)
library(ggmap)

# Bus Stops
SOL <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/BusStops1216","StopsOnLines1216")
SOL.pj <- spTransform(SOL, CRS("+proj=longlat +datum=WGS84"))

# Bus Routes
CC <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/ComCir1216","ComCir1216")
CC.pj <- spTransform(CC, CRS("+proj=longlat +datum=WGS84"))
LE <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/LimExp1216","LimExp1216")
LE.pj <- spTransform(LE, CRS("+proj=longlat +datum=WGS84"))
LCBD <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/LocalCBD1216","LocalCBD1216")
LCBD.pj <- spTransform(LCBD, CRS("+proj=longlat +datum=WGS84"))
LNCBD <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/LocalNonCBD1216","LocalNonCBD1216")
LNCBD.pj <- spTransform(LNCBD, CRS("+proj=longlat +datum=WGS84"))
RBRT <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/RapidBRT1216","RapidBRT1216")
RBRT.pj <- spTransform(RBRT, CRS("+proj=longlat +datum=WGS84"))

tmp_CC <- geometry(CC.pj)
tmp_LE <- geometry(LE.pj)
tmp_LCBD <- geometry(LCBD.pj)
tmp_LNCBD <- geometry(LNCBD.pj)
tmp_RBRT <- geometry(RBRT.pj)

# Individual Bus Route
pjs <- list()
tmps <- list()
layer_list <- ogrListLayers("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/Individuals1216")
for (i in layer_list){
  if (i != 728){
    a <- spTransform(readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/Individuals1216", toString(i)), CRS("+proj=longlat +datum=WGS84"))
    pjs <- c(pjs, a)
    tmps <- c(tmps, geometry(a))
  }
}

make Line_list

tmps <- c(tmps, tmp_CC, tmp_LE, tmp_LCBD, tmp_LNCBD, tmp_RBRT)
Line_list <- list()
for (i in 1:length(tmps)){
  for (j in 1:length(tmps[[i]])){
    Line_list <- c(Line_list, tmps[[i]][j]@lines[[1]]@Lines)
  }
}

make new_id

pjs <- c(pjs, CC.pj, LE.pj, LCBD.pj, LNCBD.pj, RBRT.pj)
LinLSs <- list()
for (i in 1:length(pjs)){
 LinLSs <- c(LinLSs, sapply(pjs[[i]]@lines, function(x) length(x@Lines)))
}
LinLSs <- LinLSs %>% unlist()
new_id <- sapply(1:length(LinLSs), function(x) paste0(x, "_", seq.int(LinLSs[[x]]))) %>% 
  unlist()

make data frame

## make a new data.frame (only route_id)
DAT=data.frame(matrix(rep(NA,1),nrow=1))[-1,]
for (i in 1:length(pjs)){
  df <- data.frame(route_id = pjs[[i]]@data$VAR_IDENT)
  DAT <- rbind(DAT, df)
}
rownames(DAT) <- new_id
SLDF <- mapply(function(x, y) Lines(x, ID = y), x = Line_list, y = new_id) %>%
  #list() %>%
  SpatialLines() %>% 
  SpatialLinesDataFrame(data = DAT)

make new lines and LA map

dat <- geocode('Los Angels')
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Los%20Angels&sensor=false
leaflet() %>%
  setView(lng = dat['lon'], lat = dat['lat']    , zoom = 11) %>%
  addPolylines(data = SLDF, color = "black", opacity = 1, weight = 1) %>% 
  addCircles(data=SOL.pj@data,~LONG, ~LAT, color = "red", weight = 0.3) %>%
  addTiles()

Q3

First I load the data.

library(quantmod)
library(highcharter)

x <- getSymbols("AUD/JPY", src = "oanda", auto.assign = FALSE)
y <- getSymbols("GBP/USD", src = "oanda", auto.assign = FALSE)

Next make Bollinger’s bands for each exchange rate.

x.BBands.ll <- BBands(x)$dn
x.BBands.ul <- BBands(x)$up
x.BBands.m <- BBands(x)$mavg
y.BBands.ll <- BBands(y)$dn
y.BBands.ul <- BBands(y)$up
y.BBands.m <- BBands(y)$mavg

The drawing code is as follows.

hc <- highchart(type="stock") %>% 
  hc_title(text="Charting Exchange Rates") %>% 
  hc_subtitle(text = "Data extracted using quantmod package") %>% 
  hc_yAxis_multiples(
    list(top = "0%", height = "50%", offset=0, opposite=TRUE),
    list(top = "50%", height = "50%", offset=0, opposite=TRUE)
  )%>%
  hc_add_series(x, id = "audjpy",name ="audjpy", yAxis=0, color="blue", lineWidth=1.5) %>%
  hc_add_series(x.BBands.ll, id = "audjpy.ll", name="audjpy Lower BBands",yAxis=0,
                color="black",dashStyle='shortdash', lineWidth=1) %>%
  hc_add_series(x.BBands.ul, id = "audjpy.ul", name="audjpy Upper BBands",yAxis=0,
                color="black",lineWidth=1) %>%
  hc_add_series(x.BBands.m, id = "audjpy.m",name="audjpy BBands MA", yAxis=0,
                color="red",lineWidth=1) %>%
  hc_add_series(y, id = "gbpusd",name="gbpusd",yAxis=1, color="green", lineWidth=1.5) %>%
  hc_add_series(y.BBands.ll, id = "gbpusd.ll",name="gbpusd Lower BBands", yAxis=1,
                color="black",dashStyle='shortdash',lineWidth=1) %>%
  hc_add_series(y.BBands.ul, id = "gbpusd.ul",name="gbpusd Upper BBands", yAxis=1,
                color="black",lineWidth=1) %>%
  hc_add_series(y.BBands.m, id = "gbpusd.m",name="gbpusd BBands MA", yAxis=1,
                color="red",lineWidth=1) %>%
  hc_add_theme(hc_theme_538())
hc

Q4

Load the libraries and the raw data, then build an ffdf after converting the character columns of the original data frame to factors.

library(nycflights13)
library(ffbase)
library(ffbase2)
library(biglm)
library(pROC)
library(chron)

tmp <- flights
tmp$carrier <- as.factor(tmp$carrier)
tmp$tailnum <- as.factor(tmp$tailnum)
tmp$origin <- as.factor(tmp$origin)
tmp$dest <- as.factor(tmp$dest)

flightff <- as.ffdf(tmp)

Next I make new columns as follows

flightff$Delay <- ffifelse(flightff$dep_delay > 0 | flightff$dep_delay == 0 , 1,0)
flightff$DepHour <- flightff$hour
flightff$Car <- ffifelse(flightff$carrier %in% as.factor(c("DL","US","DH","UA")), 1, 0)
flightff$Night <- ffifelse(flightff$hour > 18 | flightff$hour < 6, 1, 0)
flightff$Weekend <- ffifelse(day.of.week(month=flightff$month, day=flightff$day, year=flightff$year) == 6, 1, 0)

I exclude the rows whose Delay value is NA and store the result as logitff. Then I split the data set into a training set and a test set.

logitff <- flightff[!is.na(flightff$Delay),]
indx <- ff(1:nrow(logitff))
p <- 0.7
trainIndx <- ff(indx[1:trunc(length(indx)*p)])
trainset <- logitff[trainIndx,]
testIndx <- ff(indx[(trunc(length(indx)*p)+1):length(indx)])
testset <- logitff[testIndx,]

Logistic regression

fit <- bigglm.ffdf(Delay~DepHour+Car+Night+Weekend, data = trainset, family=binomial(), sandwich=TRUE)
summary(fit)
Large data regression model: bigglm(Delay ~ DepHour + Car + Night + Weekend, data = trainset, 
    family = binomial(), sandwich = TRUE)
Sample size =  229964 
               Coef    (95%     CI)     SE p
(Intercept) -1.5418 -1.5731 -1.5104 0.0157 0
DepHour      0.1020  0.0996  0.1044 0.0012 0
Car         -0.0680 -0.0858 -0.0503 0.0089 0
Night       -0.2619 -0.2912 -0.2326 0.0147 0
Weekend     -0.1543 -0.1821 -0.1265 0.0139 0
Sandwich (model-robust) standard errors

predict and make confusionmatrix in train_set

train_pred <- predict(fit, newdata = trainset, type="response")
train_pred <- ifelse(train_pred>0.5, 1,0)
train_confusion <- table(as.integer(as.data.frame(trainset)$Delay), as.integer(train_pred))
train_confusion <- addmargins(train_confusion)
train_confusion
     
           0      1    Sum
  0    94128  36392 130520
  1    56845  42599  99444
  Sum 150973  78991 229964

predict and make confusionmatrix in test_set

test_pred <- predict(fit, newdata = testset, type="response")
test_pred <- ifelse(test_pred>0.5, 1,0)
test_confusion <- table(as.integer(as.data.frame(testset)$Delay), as.integer(test_pred))
test_confusion <- addmargins(test_confusion)
test_confusion
     
          0     1   Sum
  0   39997 13058 53055
  1   25558 19944 45502
  Sum 65555 33002 98557

Draw ROC curve

test_pred <- predict(fit, newdata = testset, type="response")
roc <- roc(as.integer(as.data.frame(testset)$Delay), as.numeric(test_pred))
plot(roc)

Q5

First I load the data. And before using spark I delete the irrelevant columns.

Remove the observations with an empty borough, or with a missing or zero latitude or longitude.

Split this data into trainset and testset.

test
$test
NA

Use Decision tree

# Fit a decision tree that predicts BOROUGH from the two coordinates.
# The pipeline ends here: a trailing `%>%` with nothing after it leaves the
# expression incomplete and the code cannot be parsed.
decision_tree <- train %>%
  ml_decision_tree(response="BOROUGH", features = c("LATITUDE","LONGITUDE"), max.bins = 200L, max.depth = 10L, seed=123L)

Prediction

table(pred$BOROUGH, pred$prediction)
               
                    0     1     2     3     4
  BRONX             0     0     0  8876     0
  BROOKLYN      21181     0    24     0     1
  MANHATTAN         0 17944     0     0     0
  QUEENS           25     1 17777     0     0
  STATEN ISLAND    23     0     0     0  3169
---
title: "Big Data Analytics Assignment 1"
output:
  html_notebook: default
  html_document: default
  pdf_document: default
---

## Q1

First I load the data files
```{r}
library(tidyverse)
# Load the migration table and the state lookup table; the lookup table is
# merged in below and also supplies the chord colours.
dat <- read.csv("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/migration2012.csv")
states <- read.csv("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/states_chord.csv")

# Reshape from wide to long: every listed state column is turned into a
# (From, value) pair; the columns that are not listed are kept as they are.
dat <- tidyr::gather(
  dat,
  key = From, value = value,
  Alabama, Alaska, Arizona, Arkansas, California, Colorado, Connecticut,
  Delaware, District.of.Columbia, Florida, Georgia, Hawaii, Idaho, Illinois,
  Indiana, Iowa, Kansas, Kentucky, Louisiana, Maine, Maryland, Massachusetts,
  Michigan, Minnesota, Mississippi, Missouri, Montana, Nebraska, Nevada,
  New.Hampshire, New.Jersey, New.Mexico, New.York, North.Carolina,
  North.Dakota, Ohio, Oklahoma, Oregon, Pennsylvania, Rhode.Island,
  South.Carolina, South.Dakota, Tennessee, Texas, Utah, Vermont, Virginia,
  Washington, West.Virginia, Wisconsin, Wyoming
)
```

Replace the punctuation characters with spaces.
```{r}
# apply() visits every column (MARGIN = 2) and swaps each punctuation
# character for a space, e.g. "New.Hampshire" becomes "New Hampshire".
# NOTE(review): apply() goes through a character matrix, so every column of
# the result is text, including `value`.
dat <- data.frame(
  apply(
    dat,
    MARGIN = 2,
    FUN = function(column) gsub("[[:punct:]]", " ", column)
  )
)
dat
```

Then I add two new columns to the above data frame.

```{r}
# Attach the lookup information of the origin state: join on From == States.
d1 <- merge(dat, states, by.x="From", by.y="States")
# Drop columns 5 and 6 by position and keep column 4 as the origin ID.
# NOTE(review): these positions depend on the column layout of
# states_chord.csv, which is not visible here -- confirm before changing it.
d1 <- d1[,c(-5,-6)]
names(d1)[4] <- "ID_From"

# Same again for the destination state: join on To == States.
d2 <- merge(d1, states, by.x="To", by.y="States")
# Drop columns 6 and 7 by position and keep column 5 as the destination ID.
d2 <- d2[,c(-6,-7)]
names(d2)[5] <- "ID_To"
# Replace the working data frame with the enriched one and print it.
dat <- d2
dat
```

Sort the data frame in the ascending order

```{r}
# Two successive sorts: first by ID_From, then by ID_To.
dat <- dat %>%
  arrange(ID_From) %>%
  arrange(ID_To)
dat
```

Convert the above data frame into the matrix form

```{r}
# Wide table of ID_To: one column per To state, built from columns 1, 2, 4, 5.
t1 <- spread(dat[, c(1, 2, 4, 5)], key = To, value = ID_To)

# Wide table of the migration values with the same layout (columns 1 to 4).
t2 <- spread(dat[, c(1, 2, 3, 4)], key = To, value = value)

# Put the first row of the ID table on top of the value table and relabel it
# as a helper row: From = "ID_To", ID_From = 0.
t3 <- rbind(t1[1, ], t2)
t3$From <- as.character(t3$From)
t3[1, 2] <- 0
t3[1, 1] <- "ID_To"

# Order the rows by the ID of the origin state.
t3 <- arrange(t3, ID_From)

library(data.table)
# Column layout wanted for the matrix: the two key columns first, followed by
# the state columns in a fixed, hand-written order.
state_order <- c(
  "Connecticut", "Maine", "Massachusetts", "New Hampshire", "Rhode Island",
  "Vermont", "New Jersey", "New York", "Pennsylvania", "Illinois", "Indiana",
  "Michigan", "Ohio", "Wisconsin", "Iowa", "Kansas", "Minnesota",
  "Missouri", "Nebraska", "North Dakota", "South Dakota", "Delaware",
  "Florida", "Georgia", "Maryland", "North Carolina", "South Carolina",
  "Virginia", "District of Columbia", "West Virginia", "Alabama", "Kentucky",
  "Mississippi", "Tennessee", "Arkansas", "Louisiana", "Oklahoma", "Texas",
  "Arizona", "Colorado", "Idaho", "Montana", "Nevada", "New Mexico", "Utah",
  "Wyoming", "Alaska", "California", "Hawaii", "Oregon", "Washington"
)
# setcolorder() reorders the columns of t3 in place.
setcolorder(t3, c("From", "ID_From", state_order))

# Drop the first row and the second column (ID_From).
# NOTE(review): the first row is presumably the "ID_To" helper row, which
# sorts first when every real ID_From is greater than 0 -- confirm.
t3 <- t3[-1, -2]

# Separate the label column (first column of t3) from the value columns.
t4 <- t3[, -1]
rownames(t4) <- t3[, 1]

# Convert every column to numeric. check.names = FALSE stops data.frame()
# from rewriting column names such as "New Hampshire" to "New.Hampshire";
# with the rewritten names the row names and column names of the matrix no
# longer agree, which is what chorddiag() warns about.
t5 <- data.frame(apply(t4, 2, as.numeric), check.names = FALSE)
rownames(t5) <- t3[, 1]
t5
```

Drawing the chord diagram
```{r}
library(chorddiag)

# Draw the chord diagram from the numeric migration matrix, one colour per
# state taken from the lookup table. FALSE is spelled out because `F` is an
# ordinary variable that can be reassigned.
chorddiag(
  as.matrix(t5),
  groupColors = states$Color,
  showTicks = FALSE,
  groupnamePadding = 20,
  groupThickness = .05,
  groupnameFontsize = 10
)
```


## Q2

Read the "Stops On Lines" layer and all of the GIS data for the bus lines.
```{r}
library(dplyr)
library(sp)
library(rgdal)
library(leaflet)
library(ggmap)

# Folder that holds one sub-folder per shapefile set.
data_dir <- "/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1"
# Every layer is re-projected to this coordinate reference system.
wgs84 <- CRS("+proj=longlat +datum=WGS84")

# Read the layer `layer` from the sub-folder `folder` of data_dir.
# The layer name defaults to the folder name, which is the case for all of
# the route shapefiles below.
read_layer <- function(folder, layer = folder) {
  readOGR(file.path(data_dir, folder), layer)
}

# Bus Stops
SOL <- read_layer("BusStops1216", "StopsOnLines1216")
SOL.pj <- spTransform(SOL, wgs84)

# Bus Routes
CC <- read_layer("ComCir1216")
CC.pj <- spTransform(CC, wgs84)
LE <- read_layer("LimExp1216")
LE.pj <- spTransform(LE, wgs84)
LCBD <- read_layer("LocalCBD1216")
LCBD.pj <- spTransform(LCBD, wgs84)
LNCBD <- read_layer("LocalNonCBD1216")
LNCBD.pj <- spTransform(LNCBD, wgs84)
RBRT <- read_layer("RapidBRT1216")
RBRT.pj <- spTransform(RBRT, wgs84)

# Geometry-only versions of the route layers (attribute tables dropped).
tmp_CC <- geometry(CC.pj)
tmp_LE <- geometry(LE.pj)
tmp_LCBD <- geometry(LCBD.pj)
tmp_LNCBD <- geometry(LNCBD.pj)
tmp_RBRT <- geometry(RBRT.pj)

# Individual Bus Route
individual_dir <- file.path(data_dir, "Individuals1216")
layer_list <- ogrListLayers(individual_dir)

# Layer "728" is left out, exactly as before; the reason is not recorded in
# this file. Layer names are text, so the comparison is made against "728".
wanted_layers <- Filter(
  function(layer) layer != "728",
  as.character(layer_list)
)

# One re-projected object per remaining layer, built in a single pass instead
# of growing the lists inside a loop.
pjs <- lapply(wanted_layers, function(layer) {
  spTransform(readOGR(individual_dir, layer), wgs84)
})
tmps <- lapply(pjs, geometry)
```

make Line_list
```{r}
# Append the geometries of the five route groups to the individual routes.
# NOTE: this extends `tmps` in place, so running the chunk twice appends the
# group geometries twice.
tmps <- c(tmps, tmp_CC, tmp_LE, tmp_LCBD, tmp_LNCBD, tmp_RBRT)

# Flatten to one list of Line objects: for every geometry object, for every
# feature in its `lines` slot, take all entries of that feature's `Lines`
# slot. lapply() replaces the nested 1:length() loops, which also misbehave
# on empty input because 1:0 counts down.
Line_list <- c(
  list(),
  do.call(
    c,
    lapply(tmps, function(geom) {
      do.call(c, lapply(geom@lines, function(feature) feature@Lines))
    })
  )
)
```


make new_id
```{r}
# Append the five route groups to the individual routes.
# NOTE: this extends `pjs` in place, so running the chunk twice appends the
# groups twice.
pjs <- c(pjs, CC.pj, LE.pj, LCBD.pj, LNCBD.pj, RBRT.pj)

# Number of Line objects in every feature of one spatial object.
# vapply() always returns an integer vector, whatever the input looks like.
count_lines <- function(layer) {
  vapply(layer@lines, function(feature) length(feature@Lines), integer(1))
}
LinLSs <- unlist(lapply(pjs, count_lines))

# IDs of the form "<feature number>_<line number within the feature>", in
# feature order. rep() and sequence() give no entry at all for a feature
# with zero lines, whereas seq.int(0) would produce the two values 1 and 0.
new_id <- paste0(
  rep(seq_along(LinLSs), times = LinLSs),
  "_",
  sequence(LinLSs)
)
```

make data frame
```{r}
## make a new data.frame (only route_id)
# One small data frame per spatial object, bound together in a single call
# instead of growing DAT with rbind() inside a loop. This also avoids
# creating a global named `df`, which hides stats::df.
DAT <- do.call(
  rbind,
  lapply(pjs, function(layer) data.frame(route_id = layer@data$VAR_IDENT))
)
# The row names must equal the line IDs so that the attribute rows can be
# matched to the lines below.
# NOTE(review): this needs exactly one row per ID, i.e. one Line per feature
# in every layer -- confirm for the data at hand.
rownames(DAT) <- new_id

# Wrap every Line in its own Lines object carrying the matching ID, then
# build the spatial lines object and attach the attribute table.
SLDF <- mapply(function(x, y) Lines(x, ID = y), x = Line_list, y = new_id) %>%
  SpatialLines() %>%
  SpatialLinesDataFrame(data = DAT)
```


make new lines and LA map
```{r}
# Look up the coordinates used to centre the map (the place name was
# misspelled as 'Los Angels').
dat <- geocode("Los Angeles")

# Base map with the bus routes as black lines and the bus stops as red
# circles. setView() is given plain numbers (dat$lon, dat$lat) instead of
# one-column data frames (dat['lon'], dat['lat']).
leaflet() %>%
  setView(lng = dat$lon, lat = dat$lat, zoom = 11) %>%
  addPolylines(data = SLDF, color = "black", opacity = 1, weight = 1) %>%
  addCircles(data = SOL.pj@data, ~LONG, ~LAT, color = "red", weight = 0.3) %>%
  addTiles()
```


## Q3

First I load the data.
```{r}
library(quantmod)
library(highcharter)

# Download both exchange-rate series from OANDA; auto.assign = FALSE makes
# getSymbols() return the series instead of creating a variable itself.
x <- getSymbols(
  "AUD/JPY",
  src = "oanda",
  auto.assign = FALSE
)
y <- getSymbols(
  "GBP/USD",
  src = "oanda",
  auto.assign = FALSE
)
```

Next make Bollinger's bands for each exchange rate.
```{r}
# Compute the Bollinger bands once per series and then pick out the lower
# band (dn), the upper band (up) and the moving average (mavg), instead of
# calling BBands() three times on the same data.
x_bands <- BBands(x)
x.BBands.ll <- x_bands$dn
x.BBands.ul <- x_bands$up
x.BBands.m <- x_bands$mavg

y_bands <- BBands(y)
y.BBands.ll <- y_bands$dn
y.BBands.ul <- y_bands$up
y.BBands.m <- y_bands$mavg
```

The drawing code is as follows.
```{r}
# Stock chart with two stacked panes, each taking half of the height.
hc <- highchart(type = "stock") %>%
  hc_title(text = "Charting Exchange Rates") %>%
  hc_subtitle(text = "Data extracted using quantmod package") %>%
  hc_yAxis_multiples(
    list(top = "0%", height = "50%", offset = 0, opposite = TRUE),
    list(top = "50%", height = "50%", offset = 0, opposite = TRUE)
  ) %>%
  # Upper pane (yAxis = 0): AUD/JPY with its bands and moving average.
  hc_add_series(
    x,
    id = "audjpy", name = "audjpy", yAxis = 0,
    color = "blue", lineWidth = 1.5
  ) %>%
  hc_add_series(
    x.BBands.ll,
    id = "audjpy.ll", name = "audjpy Lower BBands", yAxis = 0,
    color = "black", dashStyle = "shortdash", lineWidth = 1
  ) %>%
  hc_add_series(
    x.BBands.ul,
    id = "audjpy.ul", name = "audjpy Upper BBands", yAxis = 0,
    color = "black", lineWidth = 1
  ) %>%
  hc_add_series(
    x.BBands.m,
    id = "audjpy.m", name = "audjpy BBands MA", yAxis = 0,
    color = "red", lineWidth = 1
  ) %>%
  # Lower pane (yAxis = 1): GBP/USD with its bands and moving average.
  hc_add_series(
    y,
    id = "gbpusd", name = "gbpusd", yAxis = 1,
    color = "green", lineWidth = 1.5
  ) %>%
  hc_add_series(
    y.BBands.ll,
    id = "gbpusd.ll", name = "gbpusd Lower BBands", yAxis = 1,
    color = "black", dashStyle = "shortdash", lineWidth = 1
  ) %>%
  hc_add_series(
    y.BBands.ul,
    id = "gbpusd.ul", name = "gbpusd Upper BBands", yAxis = 1,
    color = "black", lineWidth = 1
  ) %>%
  hc_add_series(
    y.BBands.m,
    id = "gbpusd.m", name = "gbpusd BBands MA", yAxis = 1,
    color = "red", lineWidth = 1
  ) %>%
  hc_add_theme(hc_theme_538())

# Print the chart.
hc
```


## Q4

Load the libraries and the raw data, then build an ffdf after converting the character columns of the original data frame to factors.
```{r}
library(nycflights13)
library(ffbase)
library(ffbase2)
library(biglm)
library(pROC)
library(chron)

# Work on a copy of the flights table and turn its four text columns into
# factors before handing it to as.ffdf().
tmp <- flights
factor_cols <- c("carrier", "tailnum", "origin", "dest")
tmp[factor_cols] <- lapply(tmp[factor_cols], as.factor)

flightff <- as.ffdf(tmp)
```

Next I make new columns as follows
```{r}
# Delay: 1 when the departure delay is zero or positive, 0 otherwise.
# NOTE(review): a delay of exactly 0 is counted as delayed -- confirm intent.
flightff$Delay <- ffifelse(flightff$dep_delay > 0 | flightff$dep_delay == 0 , 1,0)
# DepHour: copy of the `hour` column.
flightff$DepHour <- flightff$hour
# Car: 1 when the carrier code is one of DL, US, DH or UA, 0 otherwise.
flightff$Car <- ffifelse(flightff$carrier %in% as.factor(c("DL","US","DH","UA")), 1, 0)
# Night: 1 when the hour is after 18 or before 6, 0 otherwise.
flightff$Night <- ffifelse(flightff$hour > 18 | flightff$hour < 6, 1, 0)
# Weekend: 1 when day.of.week() returns 6 for the flight date, 0 otherwise.
# NOTE(review): presumably 6 means Saturday in chron's numbering, so Sundays
# would not be flagged as weekend -- confirm against the chron documentation.
flightff$Weekend <- ffifelse(day.of.week(month=flightff$month, day=flightff$day, year=flightff$year) == 6, 1, 0)
```

I exclude the rows whose Delay value is NA and store the result as logitff.
Then I split the data set into a training set and a test set.
```{r}
# Keep only the rows that have a Delay value.
has_delay <- !is.na(flightff$Delay)
logitff <- flightff[has_delay, ]

# Positional split: the first 70% of the rows form the training set and the
# remaining rows the test set (no shuffling).
indx <- ff(1:nrow(logitff))
p <- 0.7
n_train <- trunc(length(indx) * p)

trainIndx <- ff(indx[1:n_train])
testIndx <- ff(indx[(n_train + 1):length(indx)])

trainset <- logitff[trainIndx, ]
testset <- logitff[testIndx, ]
```

Logistic regression 
```{r}
# Logistic regression of Delay on the four derived predictors, fitted on the
# training set with sandwich standard errors requested.
fit <- bigglm.ffdf(
  Delay ~ DepHour + Car + Night + Weekend,
  data = trainset,
  family = binomial(),
  sandwich = TRUE
)
summary(fit)
```

predict and make confusionmatrix in train_set
```{r}
# Predicted probabilities on the training set, cut at 0.5 into 0/1 labels.
train_pred <- predict(fit, newdata = trainset, type = "response")
train_pred <- ifelse(train_pred > 0.5, 1, 0)

# Observed (rows) against predicted (columns), with row and column totals.
train_confusion <- addmargins(
  table(
    as.integer(as.data.frame(trainset)$Delay),
    as.integer(train_pred)
  )
)
train_confusion
```

predict and make confusionmatrix in test_set
```{r}
# Predicted probabilities on the test set, cut at 0.5 into 0/1 labels.
test_pred <- predict(fit, newdata = testset, type = "response")
test_pred <- ifelse(test_pred > 0.5, 1, 0)

# Observed (rows) against predicted (columns), with row and column totals.
test_confusion <- addmargins(
  table(
    as.integer(as.data.frame(testset)$Delay),
    as.integer(test_pred)
  )
)
test_confusion
```

Draw ROC curve
```{r}
# Recompute the predicted probabilities: the ROC curve needs the raw scores,
# not the 0/1 labels produced for the confusion matrix.
test_pred <- predict(fit, newdata = testset, type = "response")

# ROC curve of the observed Delay labels against the predicted probabilities.
roc <- roc(
  as.integer(as.data.frame(testset)$Delay),
  as.numeric(test_pred)
)
plot(roc)
```


## Q5

First I load the data. And before using spark I delete the irrelevant columns.
```{r}
library(sparklyr)
library(dplyr)
library(readr)
# Connect to a local Spark instance.
sc <- spark_connect(master = "local")

# Is there a problem with the original csv? -> It seems to complain about
# "Unspecified" -> Maybe change Unspecified to 0. -> Now it complains about
# PASSENGER VEHICKE.
# In short, all String columns seem to fail. But with titanic, Strings are
# passed without trouble. Why?
# The two examples show that whitespace is not the problem.
dat <- read_csv("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/NYPD_Motor_Vehicle_Collisions.csv")

# Keep only the four columns that are used, then copy the table to Spark.
keep_cols <- c("BOROUGH", "LATITUDE", "LONGITUDE", "UNIQUE KEY")
dat <- dat[, keep_cols]
nypd_tbl <- copy_to(sc, dat, "nypd_tbl", overwrite = TRUE)
```

Remove the observations with an empty borough, or with a missing or zero latitude or longitude.
```{r}
# Keep a row only when all five conditions hold: the borough is not empty and
# both coordinates are present and non-zero.
nypd_tbl <- filter(
  nypd_tbl,
  BOROUGH != "",
  !is.na(LATITUDE),
  !is.na(LONGITUDE),
  LATITUDE != 0,
  LONGITUDE != 0
)
nypd_tbl
```

Split this data into trainset and testset.
```{r}
# 90% / 10% split with a fixed seed; the result has one element per partition.
partitions <- sdf_partition(nypd_tbl, training = 0.9, test = 0.1, seed = 123)

# Pull the two partitions out by name.
train <- partitions[["training"]]
test <- partitions[["test"]]
```

Use Decision tree
```{r}
# Fit a decision tree that predicts BOROUGH from the two coordinates.
# The pipeline ends here: a trailing `%>%` with nothing after it leaves the
# expression incomplete and the chunk cannot be parsed.
decision_tree <- train %>%
  ml_decision_tree(
    response = "BOROUGH",
    features = c("LATITUDE", "LONGITUDE"),
    max.bins = 200L,
    max.depth = 10L,
    seed = 123L
  )
```

Prediction
```{r}
# Score the test partition with the fitted tree and bring the result back
# into R as a local table.
pred <- collect(sdf_predict(decision_tree, test))

# Observed borough (rows) against the predicted class (columns).
table(pred$BOROUGH, pred$prediction)
```
